clear all
global system "linux"

if "${system}" == "linux" {
	*global code "/"
	global code "/"
	global s "/"
}

run "${code}${s}_set-path.do"

/* RADAR Query (loan level)
SELECT ind_id, io_flag, io_flag_curr, mort_type_id, lien_type_id, units_num, balloon_flag, appraisal_amount, document_type_id, fico, mcdash_id, loan_type_id, loan_to_value, occupancy_type_id, orig_amount,orig_date, 
pp_pen_flag,pp_term_num_mon,prop_state_id, prop_type_id,prop_zip_code,purpose_type_id_mcdash,term_num_mon,termination_date,termination_type_id, year_month_id, current_int_rate, delinq_hist_12mon, investor_type_id, 
mba_stat_id,mth_pi_pay_amount, prin_bal_amount, birth_year, zip_code, credit_score_risk3, fm_balance, fm_mthly_pmt, fm_number, ct_autob_balance, ct_autob_number, ct_autof_balance, ct_autof_number, primary_fg, confidence_level 

FROM 

(SELECT DISTINCT mcdash_id as loan_temp FROM crism.view_primary_cobwr_join_lps_crism INNER JOIN test.crism_tmp_20190121_032520 
ON crism.view_primary_cobwr_join_lps_crism.prop_zip_code = test.crism_tmp_20190121_032520.tmp_col 
WHERE year_month_id >= 200901 AND year_month_id <= 201612 AND term_num_mon=360 AND units_num IN('1') AND balloon_flag IN('N') AND lien_type_id IN('1') AND mort_type_id IN('1')  
AND int_type IN('1') AND prin_bal_amount > 417000 AND prin_bal_amount <= 517000 AND orig_amount > 430000 AND orig_date >= '2003-01-01' AND orig_date < '2016-12-31' ) 
as loan_list 

INNER JOIN crism.view_primary_cobwr_join_lps_crism ON crism.view_primary_cobwr_join_lps_crism.mcdash_id = loan_list.loan_temp 
WHERE year_month_id >= 200901 AND year_month_id <= 201612
*/

*** Select zip_codes to download
use "TreatedCounties.dta", clear
drop if inlist(floor(fipscode/1000), 2, 15, 43, 66, 78) // drop alaska, HI, guam, virgin islands, PR
keep fipscode cll2009

rename fipscode county
merge 1:m county using "ZIP_COUNTY_032010.dta", keep(master matched) nogen keepusing(prop_zip_code res_ratio)

sort prop_zip_code county cll2009
by prop_zip_code: egen highcost_ratio = total(res_ratio * (cll2009 > 467000))
keep if highcost_ratio > .9

keep prop_zip_code 
duplicates drop

tostring prop_zip_code, replace
replace prop_zip_code = "0" + prop_zip_code if length(prop_zip_code)<=4

export delimited "${temp}/high-cost-zip.csv", replace noquote novarnames

*** Import Crism
capture {
	import delimited "${LPS}/crism-loanlevel.csv", clear
	
	// fill-in static variables
	sort ind_id mcdash_id year_month_id
	local vars "orig_date orig_amount appraisal_amount term_num_mon prop_zip_code pp_pen_flag pp_term_num_mon fico loan_to_value io_flag"
	local vars "`vars' purpose_type_id_mcdash occupancy_type_id document_type_id investor_type_id loan_type_id"
	local vars "`vars' termination_date termination_type_id mort_type_id lien_type_id balloon_flag units_num"     

	foreach x of varlist `vars' {
		by ind_id mcdash_id: replace `x' = `x'[_n-1] if missing(`x') == 1
	}
	gsort ind_id mcdash_id -year_month_id
	foreach x of varlist `vars' {
		by ind_id mcdash_id: replace `x' = `x'[_n-1] if missing(`x') == 1
	}
	
	// clean dates
	foreach x of varlist orig_date termination_date {
		gen temp = mofd(date(`x', "YMD"))
		drop `x' 
		rename temp `x'
		format `x' %tm
	}

	// create the variable for the current date
	gen year = floor(year_month_id/ 100)
	gen month = year_month_id - year* 100
	gen data_as_of = ym(year, month)
	format data_as_of %tm
	drop year month year_month_id
	
	// Sum Auto account types (bank and finance)
	foreach x in b f {
		replace ct_auto`x'_bal = 0 if ct_auto`x'_num == 0
	}
	foreach x in bal num {
		egen auto_`x' = rowtotal(ct_auto*_`x')
	}
	drop ct_auto*
	

	save "${LPS}/crism-loanlevel.dta", replace
	erase "${LPS}/crism-loanlevel.csv"
}


*** Clean Data
use "${LPS}/crism-loanlevel.dta", clear


// main loans: loans with any observations above CLL
sort ind_id mcdash_id data_as_of
gen standard = balloon_flag=="N" & mort_type_id==1 & lien_type_id==1 & units_num==1 & term_num_mon==360
by ind_id mcdash_id: egen MainLoan = max(standard & (prin_bal_amount > 417000) & (prin_bal_amount ~=.))
keep if MainLoan == 1
drop standard balloon_flag  mort_type_id  lien_type_id  units_num MainLoan 

// drop individuals with more than 4 first mortgages (about 10% of the remaining sample)

sort ind_id mcdash_id data_as_of
by ind_id mcdash_id: egen max_fm_number = max(fm_number)
keep if max_fm_number <= 4
drop max_fm_number 


// drop any time periods before loan orig
drop if data_as_of < orig_date

// Updated Ltv
gen ym = orig_date
merge m:1 prop_zip_code ym using "HPI_timeseries.dta", keep(matched master) nogen // hpi at loan orignation
rename hpi hpi_orig
drop ym

gen ym = data_as_of
merge m:1 prop_zip_code ym using "HPI_timeseries.dta", keep(matched master) nogen // current hpi
drop ym

drop if data_as_of >= ym(2016, 9) // this is where HPI data ends
keep if data_as_of <= ym(2013, 6)
save "${LPS}/crism-loanlevel-cleaned.dta", replace


*** Freddie Mortgage Rate Data
import delimited "${LPS}/freddie_monthly_30yr_fixed_rate.csv", clear
gen data_as_of = mofd(date(date, "MDY"))
format data_as_of %tm
drop date
rename mortgage30us frm30rate
save "${LPS}/freddie_monthly_30yr_fixed_rate.dta", replace
